Data Wrangling for Graphics

Author

Felicia Susanto

library(vegawidget)
library(readr)
library(dplyr)
library(vegalite)
library(lubridate)
library(vegabrite)
library(tidyr)
library(jsonlite)

Exercise 1 (supply and demand)

supply_demand <- read_csv("https://calvin-data304.netlify.app/data/swd-lets-practice-ex-2-3.csv")

supply_demand |>
  pivot_longer(demand:capacity, names_to = "type", values_to = "number")
# A tibble: 24 × 3
   date    type     number
   <chr>   <chr>     <dbl>
 1 2019-04 demand    46193
 2 2019-04 capacity  29263
 3 2019-05 demand    49131
 4 2019-05 capacity  28037
 5 2019-06 demand    50124
 6 2019-06 capacity  21596
 7 2019-07 demand    48850
 8 2019-07 capacity  25895
 9 2019-08 demand    47602
10 2019-08 capacity  25813
# ℹ 14 more rows
supply_demand2 <- supply_demand |>
  mutate(unmet_demand = demand - capacity)
 # pivot_longer(capacity:unmet_demand, names_to = "type", values_to = "number")
vl_chart(supply_demand) |>
  vl_fold(
    c("capacity", "demand"),
    as = c("type", "value")) |>
  vl_mark_bar() |>
  vl_encode_x("date:O") |>
  vl_encode_xOffset("type:N") |>
  # vl_facet_column("type:N") |>
  vl_encode_y("value:Q") |>
  vl_encode_color("type:N", scale = list(domain = c("demand", "capacity")))
vl_chart(supply_demand) |>
  vl_fold(
    c("capacity", "demand"),
    as = c("type", "value")) |>
  vl_mark_line(point = TRUE) |>
  vl_encode_x("date:T") |>
  vl_encode_xOffset("type:N") |>
  vl_encode_y("value:Q") |>
  vl_encode_color("type:N", scale = list(domain = c("demand", "capacity")))
demandplot <- vl_chart(supply_demand) |>
  vl_mark_bar() |>
  vl_encode_x("date:T") |>
  vl_encode_y("demand:Q") |>
  vl_encode_color(value = "red")

capacityplot <- vl_chart(supply_demand) |>
  vl_mark_bar(width = 10) |>
  vl_encode_x("date:T") |>
  vl_encode_y("capacity:Q") |>
  vl_encode_color(value = "blue") |>
  vl_encode_opacity(value = 0.7)

vl_layer(demandplot, capacityplot)
demandplot <- vl_chart(supply_demand) |>
  vl_mark_bar() |>
  vl_encode_x("date:T") |>
  vl_encode_y("demand:Q") |>
  vl_encode_color(value = "red", title = "unmet demands")
Warning: Invalid schema for object passed to or created by
modify_inner_spec.vegaspec_unit
capacityplot <- vl_chart(supply_demand) |>
  vl_mark_bar() |>
  vl_encode_x("date:T") |>
  vl_encode_y("capacity:Q") |>
  vl_encode_color(value = "blue", title = "capacity") |>
  vl_encode_opacity(value = 0.7)
Warning: Invalid schema for object passed to or created by
modify_inner_spec.vegaspec_unit
vl_layer(demandplot, capacityplot)
demandplot <- vl_chart(supply_demand) |>
  vl_mark_point() |>
  vl_encode_x("date:T") |>
  vl_encode_y("demand:Q") |>
  vl_encode_color(value = "red")

capacityplot <- vl_chart(supply_demand) |>
  vl_mark_point() |>
  vl_encode_x("date:T") |>
  vl_encode_y("capacity:Q") |>
  vl_encode_color(value = "blue")

lineplot <- vl_chart(supply_demand) |>
  vl_fold(
  c("capacity", "demand"),
  as = c("type", "value")) |>
  vl_mark_line() |>
  vl_encode_x("date:T") |>
  vl_encode_y("value:Q") |>
  vl_encode_detail("date:T")

vl_layer(demandplot, capacityplot, lineplot)
vl_chart(supply_demand2) |>
  vl_mark_line(point = TRUE) |>
  vl_encode_x("date:T") |>
  vl_encode_y("unmet_demand:Q")

Excercise 2 (jobs)

jobs_url <- 
  "https://cdn.jsdelivr.net/npm/vega-datasets@2.8.0/data/jobs.json"


#| fig-cap: "This is a caption for the figure."
vl_chart() |>
  vl_mark_point() |>
  vl_pivot("year", groupby = list("job", "sex"), value = "perc") |>
  vl_encode_x("1850:Q") |>
  vl_scale_x(type = "symlog", constant = 0.00001) |>  # note: scale must be set quite small
  vl_encode_y("2000:Q") |>
  vl_scale_y(type = "symlog", constant = 0.00001) |>  # note: scale must be set quite small
  vl_facet_column("sex:N", title = "") |>
  vl_encode_tooltip_array(list("job", "sex", "1850", "2000")) |>
  vl_add_properties(
    title =  list( 
      text = "Percent of people working various jobs",
      subtitle = "Each dot represents the one occupation. Hover to see which occupation it is."
      )) |>
  vl_add_data_url(jobs_url) 

“perc” means?

sum all values of perc in each year separated by men and women… explain what perc means

jobs_data <- fromJSON("https://cdn.jsdelivr.net/npm/vega-datasets@2.8.0/data/jobs.json")
jobs_summary <- jobs_data %>%
  group_by(year, sex) %>%
  summarise(total_perc = sum(perc, na.rm = TRUE))
`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.
glimpse(jobs_summary)
Rows: 30
Columns: 3
Groups: year [15]
$ year       <int> 1850, 1850, 1860, 1860, 1870, 1870, 1880, 1880, 1900, 1900,…
$ sex        <chr> "men", "women", "men", "women", "men", "women", "men", "wom…
$ total_perc <dbl> 0.990373464, 0.009626536, 0.856455691, 0.143544309, 0.84560…

Looking at the data, the “perc” values add up to one for each year. This means that if we sum the percentages of men and women for the same year, the total will always be one. In other words, “perc” represents the proportion of jobs each year (where the total is 1, not 100%) for both men and women. For example, in 1850, 0.990373464 (or 99%) of workers were men, while 0.009626536 (or 1%) were women. This makes sense, as most women were likely housewives at that time.

modifying graphic

jobs_data_transformed <- jobs_data |>
  pivot_wider(names_from = sex, values_from = perc, id_cols = c(job, year))

# View result
head(jobs_data_transformed)
# A tibble: 6 × 4
  job                   year       men     women
  <chr>                <int>     <dbl>     <dbl>
1 Accountant / Auditor  1850 0.000131  0        
2 Accountant / Auditor  1860 0.000214  0        
3 Accountant / Auditor  1870 0.0000998 0        
4 Accountant / Auditor  1880 0.000125  0        
5 Accountant / Auditor  1900 0.000396  0.0000272
6 Accountant / Auditor  1910 0         0        
vl_chart(jobs_data_transformed) |>
  vl_mark_point() |>
  vl_filter("datum.year == 2000 | datum.year == 1990 | datum.year == 1980 | datum.year == 1970") |>
  vl_facet("year:O", title = "", columns = 2) |>
  vl_encode_x("men:Q") |>
  vl_scale_x(type = "symlog", constant = 0.00001) |>  # note: scale must be set quite small
  vl_encode_y("women:Q") |>
  vl_scale_y(type = "symlog", constant = 0.00001) |>  # note: scale must be set quite small
  vl_encode_color("job:N") |>
  vl_encode_tooltip_array(list("job", "men", "women")) |>
  vl_axis_x(format=".0%") |>
  vl_axis_y(format=".0%") |>
  vl_add_properties(
    title =  list( 
      text = "Percent of people working various jobs",
      subtitle = "Each dot represents the one occupation. Hover to see which occupation it is."
      ))

percentage of men and women who worked in various jobs

#join data because jobs_summary contain total % for men and women in each year

jobs_summary <-
  jobs_summary |>
  pivot_wider(names_from = sex, values_from = total_perc, id_cols = year) |>
  rename_with(~ paste0(.x, "_total_perc"), -year)
jobs_data_transformed_join <- left_join(jobs_data_transformed, jobs_summary, by = "year") |>
  mutate(
    perc_men = men / men_total_perc,
    perc_women = women / women_total_perc
  )
vl_chart(jobs_data_transformed_join) |>
  vl_mark_point() |>
  vl_filter("datum.year == 2000 | datum.year == 1990 | datum.year == 1980 | datum.year == 1970") |>
  vl_facet("year:O", title = "", columns = 2) |>
  vl_encode_x("perc_men:Q") |>
  vl_scale_x(type = "symlog", constant = 0.00001) |>  # note: scale must be set quite small
  vl_encode_y("perc_women:Q") |>
  vl_scale_y(type = "symlog", constant = 0.00001) |>  # note: scale must be set quite small
  vl_encode_color("job:N") |>
  vl_encode_tooltip_array(list("job", "perc_men", "perc_women")) |>
  vl_axis_x(format=".0%") |>
  vl_axis_y(format=".0%") |>
  vl_add_properties(
    title =  list( 
      text = "Percent of people working various jobs in proportion to their gender",
      subtitle = "Each dot represents the one occupation. Hover to see which occupation it is."
      ))

#left join vegabrite

jobs_data_transformed |>
vl_lookup(
  lookup = "year",
  from = list(
    data = list(jobs_data_transformed),
    key = "year",
    fields = list("total_perc")
  )
)

Exercise 3 (price at the pump)

gas_prices <- read_csv("https://calvin-data304.netlify.app/data/pump_price_for_gasoline_us_per_liter.csv")

countries <- read_csv("https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv")
#giving both datasets common names
countries <- countries %>% rename(country = name)

joined_countries <- gas_prices %>% inner_join(countries, by = "country")

# Identify unmatched countries
unmatched_country_gas <- gas_prices %>% anti_join(countries, by = "country")  # Countries in gas_prices but not in countries
unmatched_country_countries <- countries %>% anti_join(gas_prices, by = "country")  # Countries in countries but not in gas_prices

# View results
print("Successfully Matched Data:")
[1] "Successfully Matched Data:"
print(joined_countries)
# A tibble: 154 × 37
   country `1991` `1992` `1993` `1994` `1995` `1996` `1997` `1998` `1999` `2000`
   <chr>    <dbl>  <dbl> <lgl>  <lgl>   <dbl> <lgl>  <lgl>   <dbl> <lgl>   <dbl>
 1 Afghan…     NA  NA    NA     NA      NA    NA     NA      NA    NA      NA   
 2 Angola      NA  NA    NA     NA      NA    NA     NA       0.38 NA       0.3 
 3 Albania     NA  NA    NA     NA      NA    NA     NA       0.86 NA       0.57
 4 Andorra     NA  NA    NA     NA      NA    NA     NA      NA    NA      NA   
 5 Argent…     NA   0.79 NA     NA       0.6  NA     NA       0.94 NA       1.07
 6 Armenia     NA  NA    NA     NA      NA    NA     NA       0.49 NA       0.55
 7 Antigu…     NA  NA    NA     NA      NA    NA     NA      NA    NA       0.56
 8 Austra…     NA  NA    NA     NA      NA    NA     NA       0.46 NA       0.57
 9 Austria     NA  NA    NA     NA       1.15 NA     NA       1.04 NA       0.82
10 Azerba…     NA  NA    NA     NA      NA    NA     NA       0.46 NA       0.39
# ℹ 144 more rows
# ℹ 26 more variables: `2001` <lgl>, `2002` <dbl>, `2003` <lgl>, `2004` <dbl>,
#   `2005` <lgl>, `2006` <dbl>, `2007` <lgl>, `2008` <dbl>, `2009` <lgl>,
#   `2010` <dbl>, `2011` <lgl>, `2012` <dbl>, `2013` <lgl>, `2014` <dbl>,
#   `2015` <lgl>, `2016` <chr>, `alpha-2` <chr>, `alpha-3` <chr>,
#   `country-code` <chr>, `iso_3166-2` <chr>, region <chr>, `sub-region` <chr>,
#   `intermediate-region` <chr>, `region-code` <chr>, …
print("Countries in gas_prices but NOT in countries:")
[1] "Countries in gas_prices but NOT in countries:"
print(unmatched_country_gas$country)
 [1] "UAE"              "Bolivia"          "Brunei"           "Cote d'Ivoire"   
 [5] "Congo, Dem. Rep." "Congo, Rep."      "Cape Verde"       "Czech Republic"  
 [9] "UK"               "Hong Kong, China" "Iran"             "Kyrgyz Republic" 
[13] "South Korea"      "Kosovo"           "Lao"              "St. Lucia"       
[17] "Moldova"          "Netherlands"      "North Korea"      "Palestine"       
[21] "Russia"           "Slovak Republic"  "Syria"            "Turkey"          
[25] "Tanzania"         "USA"              "Venezuela"        "Vietnam"         
print("Countries in countries but NOT in gas_data:")
[1] "Countries in countries but NOT in gas_data:"
print(unmatched_country_countries$country)
 [1] "Åland Islands"                                       
 [2] "American Samoa"                                      
 [3] "Anguilla"                                            
 [4] "Antarctica"                                          
 [5] "Aruba"                                               
 [6] "Bermuda"                                             
 [7] "Bolivia, Plurinational State of"                     
 [8] "Bonaire, Sint Eustatius and Saba"                    
 [9] "Bouvet Island"                                       
[10] "British Indian Ocean Territory"                      
[11] "Brunei Darussalam"                                   
[12] "Cabo Verde"                                          
[13] "Cayman Islands"                                      
[14] "Christmas Island"                                    
[15] "Cocos (Keeling) Islands"                             
[16] "Comoros"                                             
[17] "Congo"                                               
[18] "Congo, Democratic Republic of the"                   
[19] "Cook Islands"                                        
[20] "Côte d'Ivoire"                                       
[21] "Curaçao"                                             
[22] "Czechia"                                             
[23] "Dominica"                                            
[24] "Equatorial Guinea"                                   
[25] "Falkland Islands (Malvinas)"                         
[26] "Faroe Islands"                                       
[27] "French Guiana"                                       
[28] "French Southern Territories"                         
[29] "Gibraltar"                                           
[30] "Greenland"                                           
[31] "Guadeloupe"                                          
[32] "Guam"                                                
[33] "Guernsey"                                            
[34] "Guinea-Bissau"                                       
[35] "Heard Island and McDonald Islands"                   
[36] "Holy See"                                            
[37] "Hong Kong"                                           
[38] "Iran, Islamic Republic of"                           
[39] "Isle of Man"                                         
[40] "Jersey"                                              
[41] "Kiribati"                                            
[42] "Korea, Democratic People's Republic of"              
[43] "Korea, Republic of"                                  
[44] "Kyrgyzstan"                                          
[45] "Lao People's Democratic Republic"                    
[46] "Macao"                                               
[47] "Martinique"                                          
[48] "Mayotte"                                             
[49] "Micronesia, Federated States of"                     
[50] "Moldova, Republic of"                                
[51] "Montserrat"                                          
[52] "Netherlands, Kingdom of the"                         
[53] "New Caledonia"                                       
[54] "Niue"                                                
[55] "Norfolk Island"                                      
[56] "Northern Mariana Islands"                            
[57] "Palau"                                               
[58] "Palestine, State of"                                 
[59] "Pitcairn"                                            
[60] "Puerto Rico"                                         
[61] "Réunion"                                             
[62] "Russian Federation"                                  
[63] "Saint Barthélemy"                                    
[64] "Saint Helena, Ascension and Tristan da Cunha"        
[65] "Saint Kitts and Nevis"                               
[66] "Saint Lucia"                                         
[67] "Saint Martin (French part)"                          
[68] "Saint Pierre and Miquelon"                           
[69] "Saint Vincent and the Grenadines"                    
[70] "San Marino"                                          
[71] "Sao Tome and Principe"                               
[72] "Seychelles"                                          
[73] "Sint Maarten (Dutch part)"                           
[74] "Slovakia"                                            
[75] "Solomon Islands"                                     
[76] "South Georgia and the South Sandwich Islands"        
[77] "Svalbard and Jan Mayen"                              
[78] "Syrian Arab Republic"                                
[79] "Taiwan, Province of China"                           
[80] "Tanzania, United Republic of"                        
[81] "Tokelau"                                             
[82] "Tonga"                                               
[83] "Türkiye"                                             
[84] "Turks and Caicos Islands"                            
[85] "Tuvalu"                                              
[86] "United Arab Emirates"                                
[87] "United Kingdom of Great Britain and Northern Ireland"
[88] "United States of America"                            
[89] "United States Minor Outlying Islands"                
[90] "Venezuela, Bolivarian Republic of"                   
[91] "Viet Nam"                                            
[92] "Virgin Islands (British)"                            
[93] "Virgin Islands (U.S.)"                               
[94] "Wallis and Futuna"                                   
[95] "Western Sahara"                                      
country_gas_final <- joined_countries|>
    pivot_longer(cols = 
        c("1991","1992","1993","1994","1995","1996", "1997","1998","1999", "2000","2001", "2002","2003", "2004","2005","2006", "2007", "2008","2009", "2010", "2011","2012", "2013", "2014", "2015", "2016"),
        names_to = "year",
        values_to = "gas_prices",
        values_transform = list(gas_prices = as.numeric))
#cleaned_data_countries <- joined_countries |> drop_na()

vl_chart(country_gas_final)|>
  vl_mark_bar() |>
  vl_encode_x("year:T") |>
  vl_encode_y("gas_prices:Q", title = "gas prices (US $)")|>
  vl_encode_column("sub-region:N", columns = 2)
#my xOffset doesn't seem to be working in grouping the bars together per subregion...

vl_chart(country_gas_final, width = 300) |>
  vl_mark_bar() |>
  vl_filter("datum.year == 2000 | datum.year == 2002 | datum.year == 2004 | datum.year == 2006 | datum.year == 2008 | datum.year == 2010 | datum.year == 2012 | datum.year == 2014 |datum.year == 2016") |>
  vl_encode_x("year:O", title = "Year") |>
  vl_encode_xOffset("sub-region:N") |>  # Group bars by sub-region
  vl_encode_y("gas_prices:Q", title = "Gas Prices (US $)") |>
  vl_facet("region:N", columns = 2) |>   # Facet by region
  vl_encode_color("country:N", title = "Country") |>  # Color bars by country
  vl_encode_tooltip_array(list("country", "gas_prices", "sub-region", "country-code"))